home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
3D GFX
/
3D GFX.iso
/
amiutils
/
e_h
/
flick
/
src
/
c2p_040.s
< prev
next >
Wrap
Text File
|
1995-12-30
|
9KB
|
396 lines
; Chunky2Planar algorithm, originally by James McCoull
; Modified by Peter McGavin for variable size and depth
; and "dirty list" (hope I didn't slow it down too much)
;
; Cpu only solution VERSION 2
; Optimised for 040+fastram
; bitplanes are assumed contiguous!
; analyse instruction offsets to check performance
;void __asm c2p_8_040 (register __a0 UBYTE *chunky_data,
; register __a1 PLANEPTR raster,
; register __a2 UBYTE *dirty_list,
; register __d1 ULONG plsiz);
; a0 -> width*height chunky pixels
; a1 -> contiguous bitplanes
; a2 -> dirty list (1-byte flag for whether each 32 pixel "unit" needs updating)
; d1 = width*height/8 (width*height must be a multiple of 32)
; Exported entry point.  The assembly-time symbol `depth` selects the name
; this build of the routine is exported under:
;   depth = 8 -> _c2p_8_040
;   depth = 6 -> _c2p_6_040
;   depth = 4 -> _c2p_4_040
; Any other value stops the assembly with the "unsupported depth!" error.
; No code is emitted here; the entry label is simply placed in front of
; whatever follows.
; NOTE(review): `depth` is not defined in the part of the file shown here;
; presumably it is supplied by the build (command line or an including
; file) -- confirm.
ifeq depth-8
xdef _c2p_8_040
_c2p_8_040:
else
ifeq depth-6
xdef _c2p_6_040
_c2p_6_040:
else
ifeq depth-4
xdef _c2p_4_040
_c2p_4_040:
else
fail "unsupported depth!"
endc
endc
endc
; merge in1,in2,tmp3,tmp4,mask,shift
;
; Exchanges bit fields between two data registers.  With M = mask (\5) and
; S = shift (\6), the registers hold on exit:
;   \1 = (\1 AND NOT M) OR ((\2 AND NOT M) >> S)
;   \2 = ((\1 AND M) << S) OR (\2 AND M)
; where the right-hand sides use the values on entry.
; \3 and \4 are scratch registers; both are overwritten.
; In the line comments each letter stands for one mask-sized field of a
; register and 0 for a cleared field.
; When S is 1 the left shift of \3 is done as an add of the register to
; itself instead of lsl (the right shift of \4 always uses lsr).
merge macro ; in1,in2,tmp3,tmp4,mask,shift
; \1 = abqr
; \2 = ijyz
move.l \2,\4
move.l #\5,\3
and.l \3,\2 ; \2 = 0j0z
and.l \1,\3 ; \3 = 0b0r
eor.l \3,\1 ; \1 = a0q0
eor.l \2,\4 ; \4 = i0y0
ifeq \6-1
add.l \3,\3
else
lsl.l #\6,\3 ; \3 = b0r0
endc
lsr.l #\6,\4 ; \4 = 0i0y
or.l \3,\2 ; \2 = bjrz
or.l \4,\1 ; \1 = aiqy
endm
; merge4 in1,in2,tmp3,tmp4,mask
;
; Depth-dependent variant of `merge` for a shift of 4.
;   depth > 4 : expands to exactly  merge \1,\2,\3,\4,\5,4
;   otherwise : computes only the second result,
;                 \2 = ((\1 AND mask) << 4) OR (\2 AND mask)
;               \1 is left unchanged, \3 is overwritten and \4 is not used.
; Only parameters \1..\5 are referenced; any further argument supplied by a
; caller is ignored.
merge4 macro ; in1,in2,tmp3,tmp4,mask
ifgt depth-4
merge \1,\2,\3,\4,\5,4
else
move.l #\5,\3 ; \3 = mask
and.l \3,\2 ; \2 = 0j0z
and.l \1,\3 ; \3 = 0b0r
lsl.l #4,\3 ; \3 = b0r0
or.l \3,\2 ; \2 = bjrz
endc
endm
; `start` marks the beginning of the code.  Everything between here and the
; `c2p` label is commented out, so execution falls straight through from the
; entry label into c2p.
;
; The disabled code was a self-modifying stub: on the first call it copied
; the routine (c2p..end) down to a 16-byte-aligned address inside the
; reserved `ds.w 8` gap, patched the operand of the jmp at `start` to point
; at the copy, flushed the caches and restarted.
start:
; jmp next ; self-modified code here
;next:
; movem.l d1/a0-a2,-(sp)
;; relocate c2p to a 16-aligned address
; lea (c2p,pc),a0
; move.l a0,d0
; and.b #%11110000,d0
; move.l d0,a1
;
;; patch jmp
; move.l d0,start+2
; move.w #(end-c2p)-1,d0
;loop: move.b (a0)+,(a1)+
; dbra d0,loop
;
;; flush cache
; move.l (4).w,a6
; jsr (_LVOCacheClearU,a6)
;
;; restore parameters and restart
; movem.l (sp)+,d1/a0-a2
; bra.b start
;
; ds.w 8 ; space for relocation
; the real c2p routine starts here
;-----------------------------------------------------------------------
; c2p -- body shared by the exported _c2p_<depth>_040 entry points.
;
; In:   a0 = chunky buffer (one byte per pixel)
;       a1 = output area (contiguous bitplanes)
;       a2 = dirty list (one flag byte per 32-pixel unit)
;       d1 = plsiz (bytes per bitplane)
; Saved and restored: d2-d7/a2-a6.
;
; Stack frame (44 bytes reserved below the saved registers):
;       (0*4,sp)..(7*4,sp)  converted plane longwords awaiting output
;       (32,sp)             output address those longwords belong to
;       (36,sp)             scratch for the current output address
;
; Register use after the prologue:
;       a3 = plsiz, a4 -> end of chunky data (a0 + 8*plsiz)
;
; This part only scans for the first dirty unit.  It leaves through
; first_case (unit found, a0/a1 addressing it, a2 just past its flag)
; or through exit (nothing to do).
;-----------------------------------------------------------------------
c2p:
                movem.l d2-d7/a2-a6,-(sp)
                sub.w   #44,sp                  ; space for temporary variables

                movea.l d1,a3                   ; a3 = plsiz
                move.l  a0,a4
                lsl.l   #3,d1                   ; d1 = plsiz*8 = number of chunky bytes
                add.l   d1,a4                   ; a4 -> end of chunky data

; Coarse scan: test four dirty flags (128 pixels) per iteration.
first_loop:     tst.l   (a2)+                   ; do the next 128 pixels need updating?
                bne.b   first_patch             ; branch if yes
                adda.w  #128,a0                 ; skip 128 pixels on input
                adda.w  #16,a1                  ; skip 128 pixels on output
                cmpa.l  a0,a4                   ; flags from a4 - a0
                ; bhi, not bcc: bcc is also taken when a0 == a4, which made
                ; the scan test four dirty-list bytes beyond the end of the
                ; list and possibly convert data past the chunky buffer.
                bhi.b   first_loop              ; keep scanning while a0 < a4
                bra.w   exit                    ; exit if no changes found

; Fine scan: one of the four flags just tested is set; find which one.
first_patch:    subq.l  #4,a2                   ; back to the first of the four flags
                tst.b   (a2)+                   ; do the next 32 pixels need updating?
                bne.b   first_case              ; branch if yes
                adda.w  #32,a0                  ; skip 32 pixels on input
                addq.l  #4,a1                   ; skip 32 pixels on output
                tst.b   (a2)+                   ; do the next 32 pixels need updating?
                bne.b   first_case              ; branch if yes
                adda.w  #32,a0                  ; skip 32 pixels on input
                addq.l  #4,a1                   ; skip 32 pixels on output
                tst.b   (a2)+                   ; do the next 32 pixels need updating?
                bne.b   first_case              ; branch if yes
                adda.w  #32,a0                  ; skip 32 pixels on input
                addq.l  #4,a1                   ; skip 32 pixels on output
                tst.b   (a2)+                   ; do the next 32 pixels need updating?
                bne.b   first_case              ; branch if yes
                adda.w  #32,a0                  ; skip 32 pixels on input
                addq.l  #4,a1                   ; skip 32 pixels on output
                bra.b   first_loop              ; this should never happen
; first_case -- convert the first dirty 32-pixel unit.
;
; The 32 chunky bytes at a0 are gathered into eight registers.  Each long
; load fetches four bytes and the following word load replaces its low
; word, so the registers end up holding these byte offsets:
;   d1 = 0,1,16,17    d3 = 4,5,20,21    d0 = 8,9,24,25     d2 = 12,13,28,29
;   d4 = 2,3,18,19    d6 = 6,7,22,23    d5 = 10,11,26,27   d7 = 14,15,30,31
; The merge steps then produce one longword per bitplane.  Nothing is
; written to the output here: the plane longwords are parked in the stack
; frame at (0*4,sp)..(7*4,sp) together with the output address at (32,sp),
; and are written later by main_case or final_case.
first_case: move.l (0,a0),d1
move.l (4,a0),d3
move.l (8,a0),d0
move.l (12,a0),d2
move.l (2,a0),d4
move.l (10,a0),d5
move.l (6,a0),d6
move.l (14,a0),d7
move.w (16,a0),d1
move.w (24,a0),d0
move.w (20,a0),d3
move.w (28,a0),d2
move.w (18,a0),d4
move.w (26,a0),d5
move.w (22,a0),d6
move.w (30,a0),d7
adda.w #32,a0
; d6/d7 are the scratch registers of every merge below, so their pixel data
; waits in a5/a6 until d1/d0 can be swapped out for it.
move.l d6,a5
move.l d7,a6
merge d1,d0,d6,d7,$00ff00ff,8
merge d3,d2,d6,d7,$00ff00ff,8
; merge4: for depth <= 4 only the second operand is updated; the first
; operand's result is consumed solely inside the depth > 4 section below.
merge4 d1,d3,d6,d7,$0f0f0f0f,4
merge4 d0,d2,d6,d7,$0f0f0f0f,4
; bring in the parked data; a5/a6 now keep the d1/d0 results for planes 4-7
exg d1,a5
exg d0,a6
merge d4,d5,d6,d7,$00ff00ff,8
merge d1,d0,d6,d7,$00ff00ff,8
merge4 d4,d1,d6,d7,$0f0f0f0f,4
merge4 d5,d0,d6,d7,$0f0f0f0f,4
merge d3,d1,d6,d7,$33333333,2
merge d2,d0,d6,d7,$33333333,2
merge d3,d2,d6,d7,$55555555,1
merge d1,d0,d6,d7,$55555555,1
move.l d0,(0*4,sp) ;plane0 (movem.l is slower!)
move.l d1,(1*4,sp) ;plane1
move.l d2,(2*4,sp) ;plane2
move.l d3,(3*4,sp) ;plane3
ifgt depth-4
; planes 4 and up are built from the values kept in a5/a6 and d4/d5
move.l a5,d3
move.l a6,d2
merge d3,d4,d6,d7,$33333333,2
merge d2,d5,d6,d7,$33333333,2
ifgt depth-6
merge d3,d2,d6,d7,$55555555,1
endc
merge d4,d5,d6,d7,$55555555,1
move.l d5,(4*4,sp) ;plane4
move.l d4,(5*4,sp) ;plane5
ifgt depth-6
move.l d2,(6*4,sp) ;plane6
move.l d3,(7*4,sp) ;plane7
endc
endc
move.l a1,(32,sp) ; save output address
addq.l #4,a1 ; skip 32 pixels on output
; a0 == a4 means this was the last unit of the buffer: go and flush it
cmpa.l a0,a4
beq.w final_case
;-----------------------------------------------------------------------
; main_loop / main_patch -- walk the remaining dirty list.
;
; In:   a0 -> next chunky unit, a1 -> matching output longword,
;       a2 -> matching dirty flag, a4 -> end of chunky data.
; Four flags (128 pixels) are tested at once; when any is set, each of
; the four units is checked on its own and main_case is called for the
; dirty ones.  main_case leaves a0/a1 untouched, so both pointers are
; advanced here after every unit, dirty or not.
; Leaves through final_case once a0 has reached the end of the input.
;-----------------------------------------------------------------------
main_loop:      tst.l   (a2)+                   ; do the next 128 pixels need updating?
                bne.b   main_patch              ; branch if yes
                adda.w  #128,a0                 ; skip 128 pixels on input
                adda.w  #16,a1                  ; skip 128 pixels on output
                cmpa.l  a0,a4                   ; flags from a4 - a0
                ; bhi, not bcc: bcc is also taken when a0 == a4, which made
                ; the loop test four dirty-list bytes beyond the end of the
                ; list and possibly convert data past the chunky buffer.
                bhi.b   main_loop               ; keep going while a0 < a4
                bra.w   final_case              ; input exhausted: flush pending unit

main_patch:     subq.l  #4,a2                   ; back to the first of the four flags
                tst.b   (a2)+                   ; do the next 32 pixels need updating?
                beq.b   1$                      ; branch if no
                bsr.b   main_case
1$:             adda.w  #32,a0                  ; skip 32 pixels on input
                addq.l  #4,a1                   ; skip 32 pixels on output
                tst.b   (a2)+                   ; do the next 32 pixels need updating?
                beq.b   2$                      ; branch if no
                bsr.b   main_case
2$:             adda.w  #32,a0                  ; skip 32 pixels on input
                addq.l  #4,a1                   ; skip 32 pixels on output
                tst.b   (a2)+                   ; do the next 32 pixels need updating?
                beq.b   3$                      ; branch if no
                bsr.b   main_case
3$:             adda.w  #32,a0                  ; skip 32 pixels on input
                addq.l  #4,a1                   ; skip 32 pixels on output
                tst.b   (a2)+                   ; do the next 32 pixels need updating?
                beq.b   4$                      ; branch if no
                bsr.b   main_case
4$:             adda.w  #32,a0                  ; skip 32 pixels on input
                addq.l  #4,a1                   ; skip 32 pixels on output
                cmpa.l  a0,a4                   ; flags from a4 - a0
                bhi.b   main_loop               ; same end test as above: stop at a0 == a4
                bra.w   final_case              ; input exhausted: flush pending unit
;-----------------------------------------------------------------------
; main_case -- subroutine (called with bsr from main_patch).
;
; Converts the 32-pixel unit at a0 while writing out the plane longwords
; of the previously converted unit, one plane store between each group
; of merge steps.
;
; In:   a0 -> 32 chunky bytes, a1 -> output longword for this unit,
;       a3 = plsiz.
; Out:  a0, a1 unchanged.  The stack frame now holds this unit's plane
;       longwords and, at (32,sp), its output address.
; Uses: d0-d7, a5, a6.
;
; All frame offsets carry an extra +4 because the return address pushed
; by bsr lies on top of the frame.
;
; The nibble step uses merge4, as first_case does.  For depth > 4 merge4
; expands to the very same code as merge; for depth 4 it leaves out the
; half of the work whose result is only consumed in the depth > 4
; sections below.
;-----------------------------------------------------------------------
main_case:
                move.l  a1,(36+4,sp)            ; save current output address
                move.l  (32+4,sp),a1            ; a1 = previous output address

                ; gather the 32 chunky bytes (same layout as first_case)
                move.l  (0,a0),d1
                move.l  (4,a0),d3
                move.l  (8,a0),d0
                move.l  (12,a0),d2
                move.l  (2,a0),d4
                move.l  (10,a0),d5
                move.l  (6,a0),d6
                move.l  (14,a0),d7
                move.w  (16,a0),d1
                move.w  (24,a0),d0
                move.w  (20,a0),d3
                move.w  (28,a0),d2
                move.w  (18,a0),d4
                move.w  (26,a0),d5
                move.w  (22,a0),d6
                move.w  (30,a0),d7

                ; d6/d7 are the merge scratch registers: park their data
                move.l  d6,a5
                move.l  d7,a6

                move.l  (0*4+4,sp),(a1)         ;plane0 of previous unit
                adda.l  a3,a1                   ;a1+=plsiz
                merge   d1,d0,d6,d7,$00ff00ff,8
                merge   d3,d2,d6,d7,$00ff00ff,8

                move.l  (1*4+4,sp),(a1)         ;plane1
                adda.l  a3,a1                   ;a1+=plsiz
                merge4  d1,d3,d6,d7,$0f0f0f0f,4
                merge4  d0,d2,d6,d7,$0f0f0f0f,4

                ; bring in the parked data; a5/a6 keep d1/d0 for planes 4-7
                exg     d1,a5
                exg     d0,a6

                move.l  (2*4+4,sp),(a1)         ;plane2
                adda.l  a3,a1                   ;a1+=plsiz
                merge   d4,d5,d6,d7,$00ff00ff,8
                merge   d1,d0,d6,d7,$00ff00ff,8

                move.l  (3*4+4,sp),(a1)         ;plane3
                adda.l  a3,a1                   ;a1+=plsiz
                merge4  d4,d1,d6,d7,$0f0f0f0f,4
                merge4  d5,d0,d6,d7,$0f0f0f0f,4

                ifgt    depth-4
                move.l  (4*4+4,sp),(a1)         ;plane4
                adda.l  a3,a1                   ;a1+=plsiz
                endc
                merge   d3,d1,d6,d7,$33333333,2
                merge   d2,d0,d6,d7,$33333333,2

                ifgt    depth-4
                move.l  (5*4+4,sp),(a1)         ;plane5
                adda.l  a3,a1                   ;a1+=plsiz
                endc
                merge   d3,d2,d6,d7,$55555555,1
                merge   d1,d0,d6,d7,$55555555,1

                ; planes 0-3 of the previous unit are out: reuse their slots
                move.l  d0,(0*4+4,sp)           ;plane0 (movem.l is slower!)
                move.l  d1,(1*4+4,sp)           ;plane1
                move.l  d2,(2*4+4,sp)           ;plane2
                move.l  d3,(3*4+4,sp)           ;plane3

                ifgt    depth-4
                move.l  a5,d3
                move.l  a6,d2
                ifgt    depth-6
                move.l  (6*4+4,sp),(a1)         ;plane6
                adda.l  a3,a1                   ;a1+=plsiz
                endc
                merge   d3,d4,d6,d7,$33333333,2
                merge   d2,d5,d6,d7,$33333333,2
                ifgt    depth-6
                move.l  (7*4+4,sp),(a1)         ;plane7
                adda.l  a3,a1                   ;a1+=plsiz
                endc
                ifgt    depth-6
                merge   d3,d2,d6,d7,$55555555,1
                endc
                merge   d4,d5,d6,d7,$55555555,1
                move.l  d5,(4*4+4,sp)           ;plane4
                move.l  d4,(5*4+4,sp)           ;plane5
                ifgt    depth-6
                move.l  d2,(6*4+4,sp)           ;plane6
                move.l  d3,(7*4+4,sp)           ;plane7
                endc
                endc

                movea.l (36+4,sp),a1            ; restore current output address
                move.l  a1,(32+4,sp)            ; it becomes the pending output address
                rts
; final_case -- flush the last converted unit.
; Reached from first_case or from the main loop once the input is used up.
; Writes the plane longwords still held at (0*4,sp)..(7*4,sp) to the output
; address saved at (32,sp), stepping a1 by plsiz (a3) from one bitplane to
; the next.  Only the planes that exist for the assembled depth are written.
final_case: move.l (32,sp),a1 ; a1 = previous output address
move.l (0*4,sp),(a1) ;plane0
adda.l a3,a1 ;a1+=plsiz
move.l (1*4,sp),(a1) ;plane1
adda.l a3,a1 ;a1+=plsiz
move.l (2*4,sp),(a1) ;plane2
adda.l a3,a1 ;a1+=plsiz
move.l (3*4,sp),(a1) ;plane3
ifgt depth-4
adda.l a3,a1 ;a1+=plsiz
move.l (4*4,sp),(a1) ;plane4
adda.l a3,a1 ;a1+=plsiz
move.l (5*4,sp),(a1) ;plane5
ifgt depth-6
adda.l a3,a1 ;a1+=plsiz
move.l (6*4,sp),(a1) ;plane6
adda.l a3,a1 ;a1+=plsiz
move.l (7*4,sp),(a1) ;plane7
endc
endc
; exit -- common epilogue; also entered directly from first_loop when no
; dirty unit was found.  Drops the 44-byte frame and restores the registers
; saved in the prologue.
exit: add.w #44,sp
movem.l (sp)+,d2-d7/a2-a6
rts
; pad to a 4-byte boundary
cnop 0,4
; `end` label: in the code shown it is referenced only by the commented-out
; relocation stub (end-c2p = length of the routine)
end:
end